Load data
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.1.1
## ✔ dials 1.2.0 ✔ tibble 3.2.1
## ✔ dplyr 1.1.2 ✔ tidyr 1.3.0
## ✔ infer 1.0.4 ✔ tune 1.1.1
## ✔ modeldata 1.1.0 ✔ workflows 1.1.3
## ✔ parsnip 1.1.0 ✔ workflowsets 1.0.1
## ✔ purrr 1.0.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.6
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
Set parameter ranges based on value combinations used in experiments.
# Vectors of all possible conditions combinations
frs <- unique(CC.TotalData$Flow.rate)
chls <- unique(CC.TotalData$Chlorophyll)
guans <- unique(CC.TotalData$Guano)
lights <- unique(CC.TotalData$Light)
#frs <- as.numeric(as.character(unique(CC.TotalData$Flow.rate)))
#chls <- as.numeric(as.character(unique(CC.TotalData$Chlorophyll)))
#guans <- as.numeric(as.character(unique(CC.TotalData$Guano)))
#lights <- as.numeric(as.character(unique(CC.TotalData$Light)))
#guans <- c(1,2)
#lights <- c(1,2)
conditions <- expand.grid(frs,chls,guans,lights)
Loop through combinations
for (i in 1:dim(conditions)[1])
{
velocity <- CC.TotalData$smooth.v[
(CC.TotalData$Flow.rate==conditions[i,1] &
CC.TotalData$Chlorophyll==conditions[i,2] &
as.numeric(CC.TotalData$Guano)==conditions[i,3] &
as.numeric(CC.TotalData$Light)==conditions[i,4])]
if (length(velocity) <= 1 | length(velocity[!is.na(velocity)])==0) {
conditions[i,5:9] <- NA } else {
# Because we're using the smoothed data, we'll average the velocities into 30-step bins
idx <- 1:length(velocity)
bx <- seq(from=0.5, to=length(velocity), by=30)
velocity <- binMeans(y = velocity, x = idx, bx = bx)
c<-cor.test(log10(velocity[1:length(velocity)-1]),
log10(velocity[2:length(velocity)]))
conditions[i,5] <- c$estimate
v0 <- log10(velocity[1:length(velocity)-1])
v1 <- log10(velocity[2:length(velocity)])
df <- data.frame(v0,v1)
fit <- lm(v1 ~ v0 + 1, data = df)
conditions[i,6] <- fit$coefficients[2]
conditions[i,7] <- fit$coefficients[1]
conditions[i,8] <- sd(fit$residuals)
d <- dip.test(velocity)
conditions[i,9] <- d$p.value
hist(log10(velocity),100,
main=d$p.value)
}
}
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
conditions <- setNames(conditions,c("flow.rate","chlorophyll","guano","light","corr.val","slope","intercept","sigma","dip.test"))
Using random forest to fit the missing values for autocorrelation
Idata <- which(!is.na(conditions[,5])) # Index of data
Iblank <- which(is.na(conditions[,5])) # Index of blank data
conditions_data <- conditions[Idata,]
conditions_blank <- conditions[Iblank,]
conditions.rf.slope <- randomForest(slope ~ .,
data=select(conditions_data,c(1,2,3,4,6)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.slope)
varImpPlot(conditions.rf.slope)
conditions_blank$slope <- predict(conditions.rf.slope,conditions_blank[,1:4])
fit.slope <- predict(conditions.rf.slope,conditions_data[,1:4])
plot(fit.slope,conditions_data$slope)
plot(conditions_blank$chlorophyll,conditions_blank$slope)
plot(conditions_blank$flow.rate,conditions_blank$slope)
plot(conditions_blank$light,conditions_blank$slope)
plot(conditions_blank$guano,conditions_blank$slope)
conditions.rf.intercept <- randomForest(intercept ~ .,
data=select(conditions_data,c(1,2,3,4,7)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.intercept)
varImpPlot(conditions.rf.intercept)
conditions_blank$intercept <- predict(conditions.rf.intercept,conditions_blank[,1:4])
fit.intercept <- predict(conditions.rf.intercept,conditions_data[,1:4])
plot(fit.intercept,conditions_data$intercept)
plot(conditions_blank$chlorophyll,conditions_blank$intercept)
plot(conditions_blank$flow.rate,conditions_blank$intercept)
plot(conditions_blank$light,conditions_blank$intercept)
plot(conditions_blank$guano,conditions_blank$intercept)
conditions.rf.sigma <- randomForest(sigma ~ .,
data=select(conditions_data,c(1,2,3,4,8)),
importance=TRUE,
proximity=TRUE)
plot(conditions.rf.sigma)
varImpPlot(conditions.rf.sigma)
conditions_blank$sigma <- predict(conditions.rf.sigma,conditions_blank[,1:4])
fit.sigma <- predict(conditions.rf.sigma,conditions_data[,1:4])
plot(fit.sigma,conditions_data$sigma)
plot(conditions_blank$chlorophyll,conditions_blank$sigma)
plot(conditions_blank$flow.rate,conditions_blank$sigma)
plot(conditions_blank$light,conditions_blank$sigma)
plot(conditions_blank$guano,conditions_blank$sigma)
Save fit models
save(conditions.rf.slope,conditions.rf.intercept,conditions.rf.sigma, file='notebook13-rf-2023.10.24data.RData')